This document is to analyse the Expression of the ubiquitin-related genes, immunoproteasome, HLA molecules, etc. and see if there is a pattern that distinguishes responders from non-responders.
### Load packages, functions and paths
source(file = "../01_install.R")
source(file = "../02_functions.R")
source(file = "../03_paths.R")
### Read data & wrangle
# Clincal metadata
clinical_data_path <- file.path(results_pc_path,
"clinical_metadata.csv")
clinical_df <- read.csv(clinical_data_path, header = TRUE)[ , -1]
# Save into RDS file
clinical_metadata_path <- file.path(results_expression_path,
"clinical_metadata.rds")
saveRDS(clinical_df, file = clinical_metadata_path)
# TPM matrix
mm909_path <- file.path(data_path,
"MM909_data.RData")
load(mm909_path)
tpm_matrix <- tpm_ensg
# Get hugo_symbols
hugo_symbols_tpm <- rownames(tpm_matrix)
## Filtering out patients with missing data from rna_seq_raw and clinical_df
tpm_matrix_columns <- names(tpm_matrix)
# Subset clinical_df
clinical_df <- clinical_df %>%
dplyr::filter(Sample.ID %in% tpm_matrix_columns)
# Rename Sample.ID column
clinical_df <- clinical_df %>%
dplyr::rename(SampleID = Sample.ID,
Response = Patient.Response,
TMB = mut_load)
# Add hugo_symbols to df
tpm_matrix <- tpm_matrix %>%
dplyr::mutate(hgnc_symbol = hugo_symbols_tpm)
## Convert Hugo symbols to ensembl gene id
#1 Use grch37
ensembl_ids <- SYMBOLtoENSEMBL(hugo_symbols_tpm)
# Filter ensembl gene ids
ensembl_ids_tpm <- ensembl_ids[str_detect(ensembl_ids, "^ENSG")]
remaining_hugo <- ensembl_ids[!str_detect(ensembl_ids, "^ENSG")]
#2 Use grch38 for remaining
ensembl_ids <- SYMBOLtoENSEMBL38(remaining_hugo)
# Filter ensembl gene ids
ensembl_ids_tpm2 <- ensembl_ids[str_detect(ensembl_ids, "^ENSG")]
remaining_hugo <- ensembl_ids[!str_detect(ensembl_ids, "^ENSG")]
# (tried using biomaRt with grch37 but it couldn't map more hugo symbols to ensembl gene ids)
#3 Use biomaRt with grch38
# Connect to the GO BioMart database
ensembl = biomaRt::useMart(biomart="ENSEMBL_MART_ENSEMBL",
host="https://www.ensembl.org",
path="/biomart/martservice",
dataset="hsapiens_gene_ensembl")
# Retrieve hugo & Ensembl gene ids
hugo2ensembl_biomart_res <- biomaRt::getBM(attributes = c("external_gene_name",
"ensembl_gene_id"),
filters = "external_gene_name",
values = remaining_hugo,
mart = ensembl)
# Make sure 1 to 1 mapping
hugo2ensembl_biomart_df <- hugo2ensembl_biomart_res %>%
dplyr::rename(hgnc_symbol = external_gene_name) %>%
dplyr::arrange(hgnc_symbol, ensembl_gene_id) %>%
dplyr::group_by(hgnc_symbol) %>%
dplyr::slice(1) %>%
dplyr::ungroup()
#4 Merge results
# Convert character vectors to data.frames
df1 <- data.frame(ensembl_gene_id = ensembl_ids_tpm,
hgnc_symbol = names(ensembl_ids_tpm),
stringsAsFactors = FALSE)
df2 <- data.frame(ensembl_gene_id = ensembl_ids_tpm2,
hgnc_symbol = names(ensembl_ids_tpm2),
stringsAsFactors = FALSE)
# Combine all data.frames
combined_df <- rbind(df1, df2, hugo2ensembl_biomart_df)
# Ensure each hgnc_symbol maps to only one ensembl_gene_id
final_ensembl_tpm <- combined_df %>%
dplyr::group_by(hgnc_symbol) %>%
dplyr::summarise(ensembl_gene_id = dplyr::first(ensembl_gene_id)) %>%
dplyr::ungroup()
# Find out missing hugo symbols & convert aliases
remaining_hugo <- setdiff(hugo_symbols_tpm, final_ensembl_tpm$hgnc_symbol)
corrected_remaining_hugo <- alias2SymbolTable(remaining_hugo, species = "Hs")
# Convert character vector to data.frame
df3 <- data.frame(hgnc_symbol = remaining_hugo,
corrected_hugo = corrected_remaining_hugo,
stringsAsFactors = FALSE)
# Use biomaRt again
hugo2ensembl_biomart_res <- biomaRt::getBM(attributes = c("external_gene_name",
"ensembl_gene_id"),
filters = "external_gene_name",
values = df3$corrected_hugo,
mart = ensembl)
# Make sure 1 to 1 mapping
hugo2ensembl_biomart_df <- hugo2ensembl_biomart_res %>%
dplyr::rename(corrected_hugo = external_gene_name) %>%
dplyr::arrange(corrected_hugo, ensembl_gene_id) %>%
dplyr::group_by(corrected_hugo) %>%
dplyr::slice(1) %>%
dplyr::ungroup()
# Merge original hugo symbol
df3 <- df3 %>%
dplyr::left_join(hugo2ensembl_biomart_df, by="corrected_hugo") %>%
dplyr::distinct()
remaining_hugo <- df3 %>% dplyr::filter(is.na(ensembl_gene_id))
# Manually annotate if I can
remaining_hugo <- remaining_hugo %>%
dplyr::mutate(
ensembl_gene_id = case_when(
hgnc_symbol == "DDIT4-AS1" ~ "ENSG00000269926",
hgnc_symbol == "LINC01869" ~ "ENSG00000180279",
hgnc_symbol == "LINC02310" ~ "ENSG00000258537",
hgnc_symbol == "LINC02618" ~ "ENSG00000293386",
hgnc_symbol == "LINC02677" ~ "ENSG00000242147"
)
)
# Merge
df3 <- df3 %>%
dplyr::left_join(remaining_hugo, by=c("hgnc_symbol", "corrected_hugo")) %>%
dplyr::mutate(ensembl_gene_id = coalesce(ensembl_gene_id.y, ensembl_gene_id.x)) %>% # Prefer 'remaining_hugo' values
dplyr::select(-ensembl_gene_id.x, -ensembl_gene_id.y) %>% # Remove the temporary columns
dplyr::distinct()
remaining_hugo <- df3 %>% dplyr::filter(is.na(ensembl_gene_id))
# 6 hugo symbols remaining that are deprecated
# Remove NAs
final_df3 <- df3 %>%
dplyr::filter(!is.na(ensembl_gene_id)) %>%
dplyr::select(-corrected_hugo)
# Merge with final df
final_ensembl_tpm <- rbind(final_ensembl_tpm, final_df3)
# Put ensembl gene id to tpm_matrix
final_tpm_matrix <- tpm_matrix %>%
dplyr::left_join(final_ensembl_tpm, by="hgnc_symbol") %>%
dplyr::filter(!is.na(ensembl_gene_id))
genes <- final_tpm_matrix$ensembl_gene_id
tpm_matrix <- final_tpm_matrix %>%
dplyr::select(-hgnc_symbol, -ensembl_gene_id) %>%
as.matrix()
rownames(tpm_matrix) <- genes
tpm_matrix_def <- removeDuplicateGenesDF(tpm_matrix)
# Merge in a list with names = (raw, norm, fpkm, tpm, Clinical)
input_list_expression <- list(
"raw" = tpm_matrix_def,
"norm" = tpm_matrix_def,
"fpkm" = tpm_matrix_def,
"tpm" = tpm_matrix_def,
"cpm" = tpm_matrix_def,
"Clinical" = clinical_df)
# Save into RDS file
input_list_path <- file.path(results_expression_path,
"input_list_expression.rds")
saveRDS(input_list_expression, file = input_list_path)
# Save TPM matrix only
input_tpm_path <- file.path(results_expression_path,
"tpm_matrix.rds")
saveRDS(tpm_matrix_def, file = input_tpm_path)
biomarkers_out <- readRDS(file.path(data_path, "../MM909_MEL_TPM_biomarkers.rds"))
# Add clinical metadata & new response categories
biomarkers_clinical <- biomarkers_out %>%
dplyr::left_join(clinical_df, by="SampleID") %>%
dplyr::mutate(resp_3_cat = case_when(
RECIST %in% c("CR", "PR") ~ "R",
RECIST %in% c("SD") ~ "SD",
RECIST %in% c("PD") ~ "NR",
TRUE ~ NA_character_
))
# Process ubi info and merge
annot_lof_ub_path <- file.path(results_ubfora_path,
"annot_lof_ub.rds")
annot_lof_ub_df <- readRDS(annot_lof_ub_path)
cat_lof_ub_df <- annot_lof_ub_df %>%
dplyr::mutate(ub_category = case_when(
category %in% c("E1_genes", "E2_genes", "E3_genes") ~ "ubiquitination",
category %in% c("C19_genes", "dub_genes") ~ "deubiquitination",
TRUE ~ NA_character_
))
# Save RDS
cat_lof_ub_path <- file.path(results_expression_path,
"cat_lof_ub.rds")
saveRDS(cat_lof_ub_df, file = cat_lof_ub_path)
reduced_cat_lof_ub_df <- cat_lof_ub_df %>%
dplyr::select(sample_id, ub_category) %>%
dplyr::group_by(sample_id) %>%
dplyr::summarise(
ubi_lof = as.integer(any(ub_category == "ubiquitination")),
deubi_lof = as.integer(any(ub_category == "deubiquitination"))
)
# Add ubiquitin lof info
biomarkers_clinical_ubi <- biomarkers_clinical %>%
dplyr::left_join(reduced_cat_lof_ub_df, by=c("SampleID" = "sample_id")) %>%
tidyr::replace_na(list(ubi_lof = 0, deubi_lof = 0))
# Add ORA results
fora_results_lof_csv_path <- file.path(results_ubfora_path, "fora_results_ubiquitin_lof.csv")
fora_results_ubiquitin_lof_df <- read.csv(fora_results_lof_csv_path, header = TRUE)[ , -1]
fora_ubi_lof_clean_df <- fora_results_ubiquitin_lof_df %>%
dplyr::rename(count = overlap) %>%
dplyr::select(sample_id, pathway, count, size) %>%
dplyr::mutate(gene_ratio = count / size) %>%
dplyr::filter(!sample_id %in% c("MM909_20", "MM909_24"))
temp_data_df <- fora_ubi_lof_clean_df %>%
dplyr::left_join(biomarkers_clinical_ubi %>% dplyr::select(SampleID, TumorPurity),
by=c("sample_id" = "SampleID"))
# Response df
response_df <- biomarkers_clinical_ubi %>%
dplyr::rename(sample_id = SampleID) %>%
dplyr::select(sample_id, Response) %>%
dplyr::distinct()
# Summarize the ubi counts
summarized_data <- temp_data_df %>%
dplyr::group_by(sample_id) %>%
dplyr::summarize(
ubi_counts = sum(count[pathway %in% c("Ubiquitin-protein ligase family", "Ubiquitin-activating enzymes", "Ubiquitin-conjugating enzymes")]),
deubi_counts = sum(count[pathway %in% c("Deubiquitinases", "Peptidase family C19")])
)
# Prepare input full df
biomarkers_clinical_ubi_counts <- biomarkers_clinical_ubi %>%
dplyr::rename(IMPRES = IMRES,
IP_Wu = IP_score) %>%
dplyr::left_join(summarized_data, by=c("SampleID" = "sample_id"))
# Save into RDS file
full_df_path <- file.path(results_expression_path,
"biomarkers_clinical_ubi_counts.rds")
saveRDS(biomarkers_clinical_ubi_counts, file = full_df_path)
# Datatable of output
datatable(biomarkers_clinical_ubi_counts,
extensions = c('Buttons',
'FixedColumns'),
options = list(
dom = 'Bfrtip',
buttons = c('copy', 'excel', 'csv'),
scrollX=TRUE,
pageLength=8,
fixedColumns = list(leftColumns = 2), # Freeze the first 2 columns
columnDefs = list(list(
targets = "_all",
render = JS(
"function(data, type, row, meta) {",
" return data === null ? 'NA' : data;",
"}"
)
))
),
caption = "Expression analysis of Cancer Immunity Cycle biomarkers merged with clinical metadata and LoF ubi mutations (binary and count)."
)
The numeric biomarkers selected to evaluate Cancer Immunity Cycle are the following:
Here I compare APM_Senb with APMS_Thomson and IP_Kalaora with IP_Wu to decide only 1 of each pair since they are measuring the same and keeping both would be redundant.
# Loop through the biomarkers and generate a boxplot for each
plots <- list()
for (biomarker in c("APM_Senb", "APMS_Thomson", "IP_Kalaora", "IP_Wu")) {
p <- create_biomarker_boxplot(biomarkers_clinical_ubi_counts, biomarker)
plots[[biomarker]] <- p
}
# Print the plots
for (biomarker in c("APM_Senb", "APMS_Thomson", "IP_Kalaora", "IP_Wu")) {
print(plots[[biomarker]])
}
Since none is statistically significant, I chose to keep APM_Senb because it is based on HLA genes and IP_Wu because it takes into account the 3 immunoproteasome subunits.
# Selection of numeric biomarkers
biomarkers_numeric_df <- biomarkers_clinical_ubi_counts %>%
dplyr::select(SampleID, Response, resp_3_cat, Release_of_cancer_cell_antigens_S1_TIP, Cancer_antigen_presentation_S2_TIP, APM_Senb, IP_Wu, MIAS, Priming_and_activation_S3_TIP, Trafficking_of_immune_cells_into_tumors_overall_S4_TIP, Infiltration_of_immune_cells_into_tumors_S5_TIP, Cytotoxic_cells_cTME, Dendritic_cells_cTME, B_cells_cTME, T_cells_CD8_cTME, T_cells_CD4_cTME, T_regulatory_cells_cTME, Dysfunction, Exclusion, Recognition_of_cancer_cells_by_T_cells_S6_TIP, IMPRES, Killing_of_cancer_cells_S7_TIP, ImmuneCyt_Davoli, Tinflam_Ayers, TumorPurity)
biomarkers_numeric_vec <- colnames(biomarkers_numeric_df)[-c(1, 2, 3)]
# Selection of categorical biomarkers
biomarkers_categorical_df <- biomarkers_clinical_ubi_counts %>%
dplyr::select(SampleID, Response, resp_3_cat, Ecotype, TME) %>%
dplyr::mutate(Response = as.factor(Response),
resp_3_cat = as.factor(resp_3_cat),
TME = as.factor(TME),
Ecotype = as.factor(Ecotype))
biomarkers_categorical_vec <- colnames(biomarkers_categorical_df)[-c(1,2,3)]
# Selection of ubiquitin features
biomarkers_ubiquitin_df <- biomarkers_clinical_ubi_counts %>%
dplyr::select(SampleID, Response, resp_3_cat, ubi_lof, deubi_lof, ubi_counts, deubi_counts)
biomarkers_ubiquitin_vec <- colnames(biomarkers_categorical_df)[-c(1,2,3)]
# Individual genes pre-processing
hugo_ensembl_ub <- annot_lof_ub_df %>%
dplyr::select(corrected_hugo_symbol, ensembl_id) %>%
dplyr::distinct()
ensembl_ub_vec <- hugo_ensembl_ub %>%
dplyr::pull(ensembl_id)
## Ubiquitin-related genes with LoF in my data
hugo_ub_vec <- hugo_ensembl_ub %>%
dplyr::pull(corrected_hugo_symbol)
## Save into RDS file
ensembl_ub_path <- file.path(results_expression_path,
"ensembl_ub_vector.rds")
saveRDS(ensembl_ub_vec, file = ensembl_ub_path)
hugo_ub_path <- file.path(results_expression_path,
"hugo_ub_vector.rds")
saveRDS(hugo_ub_vec, file = hugo_ub_path)
## Proteasome subunit HGNC symbols
proteas_vec <- c("PSMB5", "PSMB6", "PSMB7", "PSMB8", "PSMB9", "PSMB10")
# Selection of ubiquitin genes
biomarkers_ubi_exp_df <- biomarkers_clinical_ubi_counts %>%
dplyr::select(SampleID, Response, resp_3_cat, one_of(hugo_ub_vec))
biomarkers_ubi_exp_vec <- colnames(biomarkers_ubi_exp_df)[-c(1,2,3)]
# Selection of proteasome genes
biomarkers_proteas_exp_df <- biomarkers_clinical_ubi_counts %>%
dplyr::select(SampleID, Response, resp_3_cat, one_of(proteas_vec))
biomarkers_proteas_exp_vec <- colnames(biomarkers_proteas_exp_df)[-c(1,2,3)]
# Fig15
# Filter the data
df2cor <- biomarkers_numeric_df %>%
dplyr::select(-SampleID, -Response, -resp_3_cat)
# Compute the correlation matrix
cor_matrix <- cor(df2cor, use = "pairwise.complete.obs", method = "pearson")
# Compute Euclidean distance from the correlation matrix
dist_matrix <- dist(cor_matrix, method = "euclidean")
# Perform hierarchical clustering
hc <- hclust(dist_matrix, method = "complete")
# Order the correlation matrix according to the clustering
ordered_cor_matrix <- cor_matrix[hc$order, hc$order]
# Visualize the correlation matrix with clustering and reduced text size
plot_path <- file.path(figures_expression_path, "biomarkers_correlation.png")
png(plot_path, width = 1000, height = 800)
corrplot(ordered_cor_matrix, method = "color", type = "upper", order = "original",
tl.col = "black", tl.srt = 45, addCoef.col = "black", number.cex = 0.65, tl.cex = 0.8) # With numbers
# corrplot(ordered_cor_matrix, method = "color", type = "upper", order = "original",
# tl.col = "black", tl.srt = 45) # Without numbers
# Add title to the plot
title(main = NULL, cex.main = 1.5)
dev.off()
## quartz_off_screen
## 2
knitr::include_graphics(plot_path)
Selected pheatmap_biomarkers_split_response_ubi_binary for thesis, but can uncomment to see other options
# Fig17
# # pheatmap_biomarkers_both_response_ubi_binary
# p1 <- pheatmap_biomarkers(biomarkers_clinical_ubi_counts, biomarkers_numeric_vec, biomarkers_categorical_vec, response_col = "Response", ubi_counts = FALSE, split_response = FALSE)
#
# # Save the plot using the png function
# plot_path <- file.path(figures_expression_path, "pheatmap_biomarkers_both_response_ubi_binary.png")
# png(filename = plot_path, width = 8, height = 6, units = "in", res = 300)
# print(p1)
# dev.off()
# pheatmap_biomarkers_split_response_ubi_binary
p2 <- pheatmap_biomarkers(biomarkers_clinical_ubi_counts, biomarkers_numeric_vec, biomarkers_categorical_vec, response_col = "Response", ubi_counts = FALSE, split_response = TRUE)
print(p2)
# Save the ggplot
plot_path <- file.path(figures_expression_path,
"pheatmap_biomarkers_split_response_ubi_binary.png")
ggsave(plot_path, p2, width = 24, height = 10, dpi = 300)
# # pheatmap_biomarkers_both_response_ubi_counts
# p3 <- pheatmap_biomarkers(biomarkers_clinical_ubi_counts, biomarkers_numeric_vec, biomarkers_categorical_vec, response_col = "Response", ubi_counts = TRUE, split_response = FALSE)
#
# # Save the plot using the png function
# plot_path <- file.path(figures_expression_path, "pheatmap_biomarkers_both_response_ubi_counts.png")
# png(filename = plot_path, width = 10, height = 6, units = "in", res = 300)
# print(p3)
# dev.off()
#
# # pheatmap_biomarkers_split_response_ubi_counts
# p4 <- pheatmap_biomarkers(biomarkers_clinical_ubi_counts, biomarkers_numeric_vec, biomarkers_categorical_vec, response_col = "Response", ubi_counts = TRUE, split_response = TRUE)
#
# # Save the ggplot
# plot_path <- file.path(figures_expression_path,
# "pheatmap_biomarkers_split_response_ubi_counts.png")
# ggsave(plot_path, p4, width = 16, height = 10, dpi = 300)
#
# # pheatmap_biomarkers_both_response_ubi_binary
# p5 <- pheatmap_biomarkers(biomarkers_clinical_ubi_counts, biomarkers_numeric_vec, biomarkers_categorical_vec, response_col = "resp_3_cat", ubi_counts = FALSE, split_response = FALSE)
#
# # Save the plot using the png function
# plot_path <- file.path(figures_expression_path, "pheatmap_biomarkers_three_response_ubi_binary.png")
# png(filename = plot_path, width = 8, height = 6, units = "in", res = 300)
# print(p5)
# dev.off()
#
# # pheatmap_biomarkers_split_response_ubi_binary
# p6 <- pheatmap_biomarkers(biomarkers_clinical_ubi_counts, biomarkers_numeric_vec, biomarkers_categorical_vec, response_col = "resp_3_cat", ubi_counts = FALSE, split_response = TRUE)
#
# # Save the ggplot
# plot_path <- file.path(figures_expression_path,
# "pheatmap_biomarkers_three_split_response_ubi_binary.png")
# ggsave(plot_path, p6, width = 24, height = 10, dpi = 300)
#
# # pheatmap_biomarkers_both_response_ubi_counts
# p7 <- pheatmap_biomarkers(biomarkers_clinical_ubi_counts, biomarkers_numeric_vec, biomarkers_categorical_vec, response_col = "resp_3_cat", ubi_counts = TRUE, split_response = FALSE)
#
# # Save the plot using the png function
# plot_path <- file.path(figures_expression_path, "pheatmap_biomarkers_three_response_ubi_counts.png")
# png(filename = plot_path, width = 10, height = 6, units = "in", res = 300)
# print(p7)
# dev.off()
#
# # pheatmap_biomarkers_split_response_ubi_counts
# p8 <- pheatmap_biomarkers(biomarkers_clinical_ubi_counts, biomarkers_numeric_vec, biomarkers_categorical_vec, response_col = "resp_3_cat", ubi_counts = TRUE, split_response = TRUE)
#
# # Save the ggplot
# plot_path <- file.path(figures_expression_path,
# "pheatmap_biomarkers_three_split_response_ubi_counts.png")
# ggsave(plot_path, p8, width = 24, height = 10, dpi = 300)
# Fig14
# Normalize biomarkers expression
ub_expression <- biomarkers_ubi_exp_df
ub_expression_vec <- biomarkers_ubi_exp_vec
## Select all columns except the first two for normalization
columns_to_normalize <- colnames(ub_expression)[-(1:3)]
## Perform min-max normalization on the selected columns
ub_expression[columns_to_normalize] <- lapply(ub_expression[columns_to_normalize], function(x) (x - min(x)) / (max(x) - min(x)))
# Pivot long
ub_expression_long <- ub_expression %>%
tidyr::pivot_longer(cols = -c(SampleID, Response, resp_3_cat),
names_to = "corrected_hugo_symbol",
values_to = "expression")
gene_ub_cat <- cat_lof_ub_df %>%
dplyr::select(corrected_hugo_symbol, category, ub_category) %>%
dplyr::distinct()
correct_hugo <- cat_lof_ub_df %>%
dplyr::select(hugo_symbol, corrected_hugo_symbol) %>%
dplyr::distinct()
# Identify genes with both "C19_genes" and another category within "deubiquitination"
genes_with_multiple_categories <- gene_ub_cat %>%
dplyr::filter(ub_category == "deubiquitination") %>%
group_by(corrected_hugo_symbol) %>%
dplyr::filter(n_distinct(category) > 1) %>%
pull(corrected_hugo_symbol) %>%
unique()
# Filter the data frame to remove "C19_genes" for those genes only
gene_ub_cat <- gene_ub_cat %>%
dplyr::filter(!(corrected_hugo_symbol %in% genes_with_multiple_categories & ub_category == "deubiquitination" & category == "C19_genes"))
ub_lof_expression <- ub_expression_long %>%
dplyr::left_join(gene_ub_cat, by="corrected_hugo_symbol")
lof_ub_binary_path <- file.path(results_ubi_db_path,
"lof_ub_binary.rds")
lof_ub_binary <- readRDS(lof_ub_binary_path)
lof_ub_binary_filtered <- lof_ub_binary %>%
dplyr::filter(rowSums(dplyr::select(., -ensembl_id, -hugo_symbol)) != 0) %>%
dplyr::left_join(correct_hugo, by="hugo_symbol") %>%
dplyr::select(-ensembl_id, -hugo_symbol) %>%
dplyr::select(corrected_hugo_symbol, everything())
# Pivot lof_ub_binary_filtered to long format
lof_ub_long <- lof_ub_binary_filtered %>%
pivot_longer(cols = -corrected_hugo_symbol,
names_to = "SampleID",
values_to = "ubi_lof")
# Join the data frames
ub_lof_expression_presence <- ub_lof_expression %>%
dplyr::left_join(lof_ub_long, by = c("SampleID", "corrected_hugo_symbol")) %>%
dplyr::mutate(ubi_lof = as.character(ubi_lof)) %>%
dplyr::mutate(ubi_lof = ifelse(is.na(ubi_lof), "0", ubi_lof))
# Calculate the mean expression for each SampleID
median_expression <- ub_lof_expression_presence %>%
group_by(SampleID) %>%
summarise(median_expression = median(expression, na.rm = TRUE)) %>%
arrange(desc(median_expression))
# Reorder SampleID based on mean expression
ub_lof_expression_presence$SampleID <- factor(ub_lof_expression_presence$SampleID, levels = median_expression$SampleID)
# Response plot
response_plot <- ggplot(ub_lof_expression_presence, aes(x = SampleID, y = "1", fill = Response)) +
geom_tile(color = "black", linewidth = 0.5) +
scale_fill_manual(values = c("R" = "#009E73", "NR" = "#D55E00"),
labels = c("R" = "Responders", "NR" = "Non-responders")) +
labs(x = NULL,
y = NULL,
fill = "Patient Response",
title = NULL) +
theme_minimal() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "right",
legend.spacing.y = unit(0.1, 'cm'),
legend.background = element_rect(fill = "white", color = NA),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "white", color = NA),
plot.background = element_rect(fill = "white", color = NA),
plot.margin = margin(0, 0, 0, 0, "pt"))
# Plot using gradient for expression
plot <- ggplot(ub_lof_expression_presence, aes(x = SampleID, y = corrected_hugo_symbol, fill = expression)) +
geom_tile(color = "black", linewidth = 0.5, na.rm = FALSE) +
scale_fill_gradient2(low = "#F0E442", mid = "#009E73", high = "#0072B2", midpoint = 0.5, na.value = "#555555") +
geom_point(data = ub_lof_expression_presence %>% dplyr::filter(ubi_lof == "1"),
aes(shape = "Present"), color = "#CC79A7", size = 3, show.legend = TRUE) +
geom_point(data = ub_lof_expression_presence, aes(size = "NA"), shape = NA, color = "#555555")+
guides(size=guide_legend("Missing expression", override.aes=list(shape=15, size = 6))) +
labs(x = "Sample ID", y = "Genes with LoF mutation", fill = "Expression Level", shape = "LoF mutation") +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
axis.ticks.x = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "right",
legend.spacing.y = unit(0.1, 'cm'),
legend.background = element_rect(fill = "white", color = NA),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "white", color = NA),
plot.background = element_rect(fill = "white", colour = NA),
plot.margin = margin(0, 0, 0, 0, "pt"))
# Extract legends
legend0 <- get_legend(response_plot)
## Warning in get_plot_component(plot, "guide-box"): Multiple components found;
## returning the first one. To return all, use `return_all = TRUE`.
legend1 <- get_legend(plot)
## Warning: Using size for a discrete variable is not advised.
## Warning: Removed 2222 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning in get_plot_component(plot, "guide-box"): Multiple components found;
## returning the first one. To return all, use `return_all = TRUE`.
# Combine plots without legends
combined_plot <- response_plot + plot + plot_layout(nrow = 2, ncol = 1, heights = c(0.25, 14)) & theme(legend.position = "none")
# Combine legends
combined_legends <- plot_grid(as_ggplot(legend0) + theme(plot.margin = unit(c(0, 0, 0, 0), "pt")),
as_ggplot(legend1) + theme(plot.margin = unit(c(20, 0, 0, 0), "pt")),
ncol = 2, align = "v", rel_widths = c(1, 1))
# Combine plots and legends
final_plot <- plot_grid(combined_plot, combined_legends, ncol = 2, rel_widths = c(4, 2), rel_heights = c(20, 1)) +
theme(plot.background = element_rect(fill = "white", color = NA))
## Warning: Using size for a discrete variable is not advised.
## Warning: Removed 2222 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Print the final plot
print(final_plot)
# Save the ggplot
plot_path <- file.path(figures_expression_path,
"ubiquitin_lof_presence_and_expression.png")
ggsave(plot_path, final_plot, width = 10, height = 14, dpi = 300)
datatable(biomarkers_ubi_exp_df,
extensions = c('Buttons',
'FixedColumns'),
options = list(
dom = 'Bfrtip',
buttons = c('copy', 'excel', 'csv'),
scrollX=TRUE,
pageLength=8,
fixedColumns = list(leftColumns = 2), # Freeze the first 2 columns
columnDefs = list(list(
targets = "_all",
render = JS(
"function(data, type, row, meta) {",
" return data === null ? 'NA' : data;",
"}"
)
))
),
caption = "Expression analysis of Ubiquitin-related genes with LoF in samples."
)
datatable(biomarkers_proteas_exp_df,
extensions = c('Buttons',
'FixedColumns'),
options = list(
dom = 'Bfrtip',
buttons = c('copy', 'excel', 'csv'),
scrollX=TRUE,
pageLength=8,
fixedColumns = list(leftColumns = 2), # Freeze the first 2 columns
columnDefs = list(list(
targets = "_all",
render = JS(
"function(data, type, row, meta) {",
" return data === null ? 'NA' : data;",
"}"
)
))
),
caption = "Expression analysis of Proteasome subunits."
)
#Fig13
proteas_boxplot_path <- file.path(figures_expression_path, "proteasome_boxplot.png")
proteas_boxplot <- create_proteas_boxplot(biomarkers_proteas_exp_df, biomarkers_proteas_exp_vec)
ggsave(proteas_boxplot_path, proteas_boxplot, width = 12, height = 8, dpi = 300)
print(proteas_boxplot)
# Loop through the biomarkers and generate a boxplot for each
plots <- list()
for (biomarker in c("TumorPurity")) {
p <- create_biomarker_boxplot(biomarkers_clinical_ubi_counts, biomarker)
plots[[biomarker]] <- p
}
# Print the plots
for (biomarker in c("TumorPurity")) {
print(plots[[biomarker]])
}
# Prepare data
# Count the number of mutations per sample_id
mutations_per_sample <- cat_lof_ub_df %>%
dplyr::group_by(sample_id) %>%
dplyr::summarise(mutation_count = n(), .groups = 'drop') %>%
dplyr::left_join(biomarkers_clinical_ubi_counts %>% dplyr::select(SampleID, TumorPurity),
by=c("sample_id" = "SampleID"))
# Add samples with no ubi LoF
remaining_samples <- biomarkers_clinical_ubi_counts %>%
dplyr::select(SampleID, TumorPurity) %>%
dplyr::filter(!(SampleID %in% mutations_per_sample$sample_id)) %>%
dplyr::mutate(mutation_count = 0) %>%
dplyr::rename(sample_id = SampleID)
# Combine both data frames
complete_samples <- bind_rows(mutations_per_sample, remaining_samples)
# Response_df
response_df <- biomarkers_clinical_ubi_counts %>%
dplyr::rename(sample_id = SampleID) %>%
dplyr::select(sample_id, Response) %>%
dplyr::distinct()
prepared_data_df <- mutations_per_sample %>%
dplyr::left_join(response_df, by="sample_id") %>%
dplyr::mutate(short_sample_id = sub(".*_(\\d+)$", "\\1", sample_id))
# Fig16
# Calculate regression statistics
regression_model <- lm(mutation_count ~ TumorPurity + Response, data = prepared_data_df)
regression_summary <- summary(regression_model)
# Extract the p-value for Intercept
p_value <- broom::tidy(regression_model) %>%
dplyr::filter(term == "(Intercept)") %>%
pull(p.value)
# Create scatter plot with regression line
scatter_plot <- ggplot(prepared_data_df, aes(x = TumorPurity, y = mutation_count, color = Response)) +
geom_point(size = 3) +
geom_label_repel(aes(label = short_sample_id), size = 3, color = "black",
max.overlaps = 50, max.time = 2, alpha = 0.5, fill = "white") +
scale_color_manual(values = c("R" = "#009E73", "NR" = "#D55E00"),
labels = c("R" = "Responders", "NR" = "Non-responders")) +
geom_smooth(method = "lm", se = TRUE, color = "black", linetype = "dashed") +
labs(title = NULL,
x = "Tumor Purity",
y = "nº of LoF mutations",
color = "Patient Response") +
scale_y_continuous(breaks = seq(0, 25, by = 5),
minor_breaks = seq(0, 25, by = 5)) +
scale_x_continuous(breaks = seq(0, 1, by = 0.1),
minor_breaks = seq(0, 1, by = 0.1)) +
coord_cartesian(ylim = c(0, NA)) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5),
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "white", colour = "black"),
plot.background = element_rect(fill = "white", colour = NA)) +
annotate("text", x = Inf, y = Inf, label = paste("p-value:", format(p_value, digits = 3)),
hjust = 1.5, vjust = 18, size = 4, color = "black")
# ggsave
ggsave(file.path(figures_expression_path, "regression_tp_ubi.png"), scatter_plot, width = 9, height = 5, dpi = 300)
## `geom_smooth()` using formula = 'y ~ x'
# Print the scatter plot
print(scatter_plot)
## `geom_smooth()` using formula = 'y ~ x'
Looks like it is not correlated significatively (p>0.05).
sessionInfo()
## R version 4.3.2 (2023-10-31)
## Platform: x86_64-apple-darwin20 (64-bit)
## Running under: macOS Monterey 12.7.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/Copenhagen
## tzcode source: internal
##
## attached base packages:
## [1] stats4 grid stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] tidyHeatmap_1.8.1 ComplexHeatmap_2.18.0
## [3] GO.db_3.18.0 mygene_1.38.0
## [5] ggupset_0.3.0 clusterProfiler_4.10.1
## [7] limma_3.58.1 fgsea_1.28.0
## [9] EnsDb.Hsapiens.v86_2.99.0 EnsDb.Hsapiens.v75_2.99.0
## [11] ensembldb_2.26.0 AnnotationFilter_1.26.0
## [13] GenomicFeatures_1.54.4 GSEABase_1.64.0
## [15] graph_1.80.0 annotate_1.80.0
## [17] XML_3.99-0.16.1 AnnotationDbi_1.64.1
## [19] maftools_2.18.0 biomaRt_2.58.2
## [21] VariantAnnotation_1.48.1 Rsamtools_2.18.0
## [23] Biostrings_2.70.3 XVector_0.42.0
## [25] SummarizedExperiment_1.32.0 Biobase_2.62.0
## [27] GenomicRanges_1.54.1 GenomeInfoDb_1.38.8
## [29] IRanges_2.36.0 S4Vectors_0.40.2
## [31] MatrixGenerics_1.14.0 matrixStats_1.3.0
## [33] BiocGenerics_0.48.1 magrittr_2.0.3
## [35] dendextend_1.17.1 corrplot_0.92
## [37] broom_1.0.5 ggrepel_0.9.5
## [39] ggplotify_0.1.2 pheatmap_1.0.12
## [41] cowplot_1.1.3 ggnewscale_0.4.10
## [43] reshape2_1.4.4 ggdendro_0.2.0
## [45] ggpubr_0.6.0 patchwork_1.2.0
## [47] gridtext_0.1.5 circlize_0.4.16
## [49] VennDiagram_1.7.3 futile.logger_1.4.3
## [51] HGNChelper_0.8.1 msigdbr_7.5.1
## [53] jsonlite_1.8.8 httr_1.4.7
## [55] DT_0.33 knitr_1.45
## [57] lubridate_1.9.3 forcats_1.0.0
## [59] stringr_1.5.1 dplyr_1.1.4
## [61] purrr_1.0.2 readr_2.1.5
## [63] tidyr_1.3.1 tibble_3.2.1
## [65] ggplot2_3.5.1 tidyverse_2.0.0
## [67] readxl_1.4.3 rprojroot_2.0.4
##
## loaded via a namespace (and not attached):
## [1] progress_1.2.3 nnet_7.3-19
## [3] HDF5Array_1.30.1 vctrs_0.6.5
## [5] digest_0.6.35 png_0.1-8
## [7] shape_1.4.6.1 proxy_0.4-27
## [9] MASS_7.3-60.0.1 foreach_1.5.2
## [11] qvalue_2.34.0 withr_3.0.0
## [13] xfun_0.44 ggfun_0.1.4
## [15] survival_3.6-4 memoise_2.0.1
## [17] gson_0.1.0 systemfonts_1.1.0
## [19] ragg_1.3.2 tidytree_0.4.6
## [21] zoo_1.8-12 GlobalOptions_0.1.2
## [23] DNAcopy_1.76.0 Formula_1.2-5
## [25] prettyunits_1.2.0 KEGGREST_1.42.0
## [27] rstatix_0.7.2 restfulr_0.0.15
## [29] rhdf5filters_1.14.1 rhdf5_2.46.1
## [31] rstudioapi_0.16.0 generics_0.1.3
## [33] DOSE_3.28.2 base64enc_0.1-3
## [35] babelgene_22.9 curl_5.2.1
## [37] zlibbioc_1.48.2 ScaledMatrix_1.10.0
## [39] ggraph_2.2.1 polyclip_1.10-6
## [41] GenomeInfoDbData_1.2.11 quadprog_1.5-8
## [43] SparseArray_1.2.4 xtable_1.8-4
## [45] doParallel_1.0.17 evaluate_0.23
## [47] S4Arrays_1.2.1 BiocFileCache_2.10.2
## [49] preprocessCore_1.64.0 hms_1.1.3
## [51] glmnet_4.1-8 irlba_2.3.5.1
## [53] colorspace_2.1-0 filelock_1.0.3
## [55] viridis_0.6.5 ggtree_3.10.1
## [57] limSolve_1.5.7.1 lattice_0.22-6
## [59] shadowtext_0.1.3 class_7.3-22
## [61] Hmisc_5.1-2 pillar_1.9.0
## [63] nlme_3.1-164 iterators_1.0.14
## [65] beachmat_2.18.1 compiler_4.3.2
## [67] stringi_1.8.4 GenomicAlignments_1.38.2
## [69] plyr_1.8.9 crayon_1.5.2
## [71] abind_1.4-5 BiocIO_1.12.0
## [73] gridGraphics_0.5-1 chron_2.3-61
## [75] locfit_1.5-9.9 graphlayouts_1.1.1
## [77] org.Hs.eg.db_3.18.0 bit_4.0.5
## [79] fastmatch_1.1-4 textshaping_0.3.7
## [81] codetools_0.2-20 BiocSingular_1.18.0
## [83] crosstalk_1.2.1 bslib_0.7.0
## [85] e1071_1.7-14 GetoptLong_1.0.5
## [87] splines_4.3.2 Rcpp_1.0.12
## [89] dbplyr_2.5.0 sparseMatrixStats_1.14.0
## [91] HDO.db_0.99.1 cellranger_1.1.0
## [93] blob_1.2.4 utf8_1.2.4
## [95] clue_0.3-65 fs_1.6.4
## [97] checkmate_2.3.1 DelayedMatrixStats_1.24.0
## [99] GSVA_1.50.5 ggsignif_0.6.4
## [101] sqldf_0.4-11 Matrix_1.6-5
## [103] statmod_1.5.0 tzdb_0.4.0
## [105] lpSolve_5.6.20 tweenr_2.0.3
## [107] pkgconfig_2.0.3 tools_4.3.2
## [109] cachem_1.0.8 RSQLite_2.3.6
## [111] viridisLite_0.4.2 DBI_1.2.2
## [113] fastmap_1.2.0 rmarkdown_2.26
## [115] scales_1.3.0 sass_0.4.9
## [117] BiocManager_1.30.23 carData_3.0-5
## [119] rpart_4.1.23 farver_2.1.2
## [121] mgcv_1.9-1 survminer_0.4.9
## [123] tidygraph_1.3.1 scatterpie_0.2.2
## [125] gsubfn_0.7 yaml_2.3.8
## [127] foreign_0.8-86 rtracklayer_1.62.0
## [129] cli_3.6.2 lifecycle_1.0.4
## [131] lambda.r_1.2.4 backports_1.4.1
## [133] BiocParallel_1.36.0 timechange_0.3.0
## [135] gtable_0.3.5 rjson_0.2.21
## [137] parallel_4.3.2 ape_5.8
## [139] bitops_1.0-7 bit64_4.0.5
## [141] yulab.utils_0.1.4 proto_1.0.0
## [143] futile.options_1.0.1 highr_0.10
## [145] jquerylib_0.1.4 GOSemSim_2.28.1
## [147] survMisc_0.5.6 lazyeval_0.2.2
## [149] htmltools_0.5.8.1 enrichplot_1.22.0
## [151] KMsurv_0.1-5 rappdirs_0.3.3
## [153] formatR_1.14 glue_1.7.0
## [155] RCurl_1.98-1.14 treeio_1.26.0
## [157] BSgenome_1.70.2 gridExtra_2.3
## [159] igraph_2.0.3 R6_2.5.1
## [161] DESeq2_1.42.1 SingleCellExperiment_1.24.0
## [163] labeling_0.4.3 km.ci_0.5-6
## [165] cluster_2.1.6 Rhdf5lib_1.24.2
## [167] aplot_0.2.2 DelayedArray_0.28.0
## [169] tidyselect_1.2.1 ProtGenerics_1.34.0
## [171] htmlTable_2.4.2 ggforce_0.4.2
## [173] xml2_1.3.6 car_3.1-2
## [175] rsvd_1.0.5 munsell_0.5.1
## [177] data.table_1.15.4 htmlwidgets_1.6.4
## [179] RColorBrewer_1.1-3 rlang_1.1.3
## [181] IOBR_0.99.8 fansi_1.0.6